Task: Create this plot¶

1-30-26 Update

make this (or a simialar) scatterplot for E8, E10 and MEF RNA data with overlap for genes (and if you have time also TEs) image.png

In [1]:
# Import libraries
suppressPackageStartupMessages({
  library(DESeq2)
  library(RColorBrewer)
  library(dplyr)
  library(tidyr)
  library(plotly)
  library(ggplot2)
  library(stringr)
  require(httr)
  require(jsonlite)
  library(ggrepel)
  library(cowplot)    
})
In [2]:
# Set working directory to correct directory. Change as needed
# Note: for this project, several necessary files are stored here
setwd("/data/gallegosda/GEO/RNA-seq/pca_ma_plots/final/")

Use actual chip-target data¶

In [3]:
peakFileAnnotated = "shared_777_mESC_peaks_EGFP_clone2_3_over_21_3163_mm10_annotated.csv"
In [4]:
genes_near_ESC_peaks_df <- read.csv(peakFileAnnotated)
colnames(genes_near_ESC_peaks_df)[1] <- "PeakID"
In [5]:
head(genes_near_ESC_peaks_df, n=3)
A data.frame: 3 × 19
PeakIDChrStartEndStrandPeak.ScoreFocus.Ratio.Region.SizeAnnotationDetailed.AnnotationDistance.to.TSSNearest.PromoterIDEntrez.IDNearest.UnigeneNearest.RefseqNearest.EnsemblGene.NameGene.AliasGene.DescriptionGene.Type
<int><chr><int><int><chr><int><lgl><chr><chr><int><chr><int><lgl><chr><chr><chr><chr><chr><chr>
1 326chr112109130121091616+0NA5' UTR (NM_023324, exon 1 of 7) 5' UTR (NM_023324, exon 1 of 7)134NM_023324 67245NANM_023324ENSMUSG00000020134Peli1 2810468L03Rik|A930031K15Rik|D11Ertd676epellino 1 protein-coding
21282chr177555137575551695+0NAintron (NM_001357860, intron 1 of 6)CpG 411NM_001357860 72722NANM_133747ENSMUSG00000002017Fam98a2810405J04Rik family with sequence similarity 98, member A protein-coding
32324chr6 3911768439119019+0NApromoter-TSS (NR_045813) promoter-TSS (NR_045813) -2NM_172893 243771NANM_172893ENSMUSG00000038507Parp129930021O16|ARTD12|PARP-12|Zc3hdc1 poly (ADP-ribose) polymerase family, member 12protein-coding
In [6]:
# Read a space or tab-separated file named "my_data.txt" in your working directory
te_near_ESC_peaks_df <- read.table("overlap_3163_repeatmasker.txt", header = FALSE, sep = "")
In [7]:
head(te_near_ESC_peaks_df, n=3)
A data.frame: 3 × 6
V1V2V3V4V5V6
<chr><int><int><chr><int><chr>
1chr163828416382862GC_rich21+
2chr163832056383216GC_rich24+
3chr199674159967442GC_rich27+
In [8]:
colnames(te_near_ESC_peaks_df) <- c("chr", "start", "end", "repeatmaskerDesc", "val", "plusMinus")
In [9]:
head(te_near_ESC_peaks_df, n=3)
A data.frame: 3 × 6
chrstartendrepeatmaskerDescvalplusMinus
<chr><int><int><chr><int><chr>
1chr163828416382862GC_rich21+
2chr163832056383216GC_rich24+
3chr199674159967442GC_rich27+
In [10]:
all_mouse_gene_ENSEMBL_IDs_and_gene_names_df <- read.csv("all_mouse_gene_ENSEMBL_IDs_and_gene_names.txt")
head(all_mouse_gene_ENSEMBL_IDs_and_gene_names_df, n=3)
A data.frame: 3 × 5
Gene.stable.IDGene.stable.ID.versionTranscript.stable.IDTranscript.stable.ID.versionGene.name
<chr><chr><chr><chr><chr>
1ENSMUSG00000064336ENSMUSG00000064336.1ENSMUST00000082387ENSMUST00000082387.1mt-Tf
2ENSMUSG00000064337ENSMUSG00000064337.1ENSMUST00000082388ENSMUST00000082388.1mt-Rnr1
3ENSMUSG00000064338ENSMUSG00000064338.1ENSMUST00000082389ENSMUST00000082389.1mt-Tv
In [11]:
all_mouse_gene_ENSEMBL_IDs_and_gene_names_df[all_mouse_gene_ENSEMBL_IDs_and_gene_names_df$Gene.name == 'Meioc', ]
A data.frame: 3 × 5
Gene.stable.IDGene.stable.ID.versionTranscript.stable.IDTranscript.stable.ID.versionGene.name
<chr><chr><chr><chr><chr>
212473ENSMUSG00000051455ENSMUSG00000051455.14ENSMUST00000156590ENSMUST00000156590.8Meioc
212474ENSMUSG00000051455ENSMUSG00000051455.14ENSMUST00000100378ENSMUST00000100378.4Meioc
212475ENSMUSG00000051455ENSMUSG00000051455.14ENSMUST00000155813ENSMUST00000155813.2Meioc
In [12]:
nrow(all_mouse_gene_ENSEMBL_IDs_and_gene_names_df)
278396
In [13]:
# Remove duplicates based only on the 'Gene.stable.ID' column, keeping the first occurrence
all_unique_mouse_genes_ids_df <- all_mouse_gene_ENSEMBL_IDs_and_gene_names_df[!duplicated(all_mouse_gene_ENSEMBL_IDs_and_gene_names_df$Gene.stable.ID), ]
In [14]:
nrow(all_unique_mouse_genes_ids_df)
78334

Create a function to generate scatter plots¶

In [15]:
# The createScatterPlots function receives these as inputs: 
# 1. Count Table file, cntTableFile -- e.g., "TEtranscripts_GRCm38_E10_777tm1d_2KO_males_vs_1WT_female_1WT_male_non_stranded.cntTable"
# 2. Number of samples in the experimental/treatment group, TGroupNum -- e.g., 2
# 3. Number of samples in the control group, CGroupNum -- e.g., 2
# 4. Number of gene names with high logfold2change values to include as text labels in plot, GeneTELabelNum -- e.g., 20
# 5. Plot width, plotWidth -- e.g., default == 6
# 6. Plot height, plotHeight -- e.g., default == 5
# 7. Plot dpi, plotDpi -- e.g., default == 300

# The createScatterPlots function returns:
# 1. Gene scatter plot
# 2. TE scatter plot

createScatterPlots <- function(cntTableFile, TGroupNum, CGroupNum, GeneTELabelNum, plotWidth = 6, plotHeight = 5, plotDpi = 300) {

    # Read cntTableFile
    data <- read.table(cntTableFile,header=T,row.names=1)

    # Get treatment and control groups
    groups <- factor(c(rep("TGroup",TGroupNum),rep("CGroup",CGroupNum)))

    # Generate dds
    min_read <- 1
    data <- data[apply(data,1,function(x){max(x)}) > min_read,]
    sampleInfo <- data.frame(groups,row.names=colnames(data))
    suppressPackageStartupMessages(library(DESeq2))
    dds <- DESeqDataSetFromMatrix(countData = data, colData = sampleInfo, design = ~ groups)
    dds$groups = relevel(dds$groups,ref="CGroup")
    dds <- DESeq(dds)

    # Get res via DESeq2 results function
    res <- results(dds,independentFiltering=F)
    resSig <- res[(!is.na(res$padj) & (res$padj < 0.050000) & (abs(res$log2FoldChange)> 0.000000)), ]

    # Normalize counts per sample
    norm <- counts(dds, normalized = TRUE)

    # Define treatment and control groups
    grp <- colData(dds)$groups  # factor with levels like TGroup / CGroup

    # Mean normalized count in each condition
    x_ctrl <- rowMeans(norm[, grp == "CGroup", drop=FALSE])
    y_trt  <- rowMeans(norm[, grp == "TGroup", drop=FALSE])
    
    # Assemble plotting table
    df <- data.frame(
      id = rownames(norm),
      baseMean_C = x_ctrl,
      baseMean_T = y_trt,
      log10_C = log10(x_ctrl + 1),
      log10_T = log10(y_trt + 1),
      log2FC = res$log2FoldChange,
      padj  = res$padj,
      stringsAsFactors = FALSE
    )

    # Separate by Genes and TEs

    # ---------------------------------------------------------------------------------------------------------
    # --- Genes -----------------------------------------------------------------------------------------------
    # ---------------------------------------------------------------------------------------------------------

    df_genes = df[grep("ENSMUS", row.names(df)),]

    # Remove dots and chars following them from id column
    df_genes$id <- sub("\\..*", "", df_genes$id)

    # Add chip_target boolean column to df_genes: TRUE if a gene is near an ESC ChIP-seq peak, FALSE otherwise
    df_genes <- df_genes %>%
      mutate(chip_target = id %in% genes_near_ESC_peaks_df$Nearest.Ensembl)

    # Add gene column to store gene name via a left join with the annotated ESC ChIP-seq peak file
    df_genes <- df_genes %>%
      left_join(
        genes_near_ESC_peaks_df %>%
          select(Nearest.Ensembl, Gene.Name),
        by = c("id" = "Nearest.Ensembl")
      ) %>%
      rename(gene = Gene.Name)

    # Add a DE column if significant 
    df_genes$DE <- (!is.na(df_genes$padj) & (df_genes$padj < 0.050000) & (abs(df_genes$log2FC)> 0.000000))
    
    # Specify categories for plot colors
    df_genes$category <- "not_DE"
    df_genes$category[df_genes$DE & !df_genes$chip_target] <- "DE_no_peak"
    df_genes$category[df_genes$DE &  df_genes$chip_target] <- "DE_with_peak"

    # Left join df_genes with all unique mouse gene names based on ENSEMBL id
    df_genes <- df_genes %>%
      left_join(
        all_unique_mouse_genes_ids_df %>%
          select(Gene.stable.ID, Gene.name),
        by = c("id" = "Gene.stable.ID")
      ) %>%
      rename(gene_ENSEMBL_join = Gene.name)

    # Fill NA gene_ENSEMBL_join names with ENSEMBL id
    df_genes$gene_ENSEMBL_join[is.na(df_genes$gene_ENSEMBL_join)] <-
      df_genes$id[is.na(df_genes$gene_ENSEMBL_join)]
    
    # Get top GeneLabelNum (e.g., top 20 if user inputs 20 for GeneTELabelNum) gene names in the correct order
    
    # Subset DE genes
    df_genes_DE <- subset(
      df_genes,
      category == "DE_with_peak" | category == "DE_no_peak"
    )

    sorted_df_genes_DE <- df_genes_DE %>%
      arrange(desc(abs(log2FC)))

    if (nrow(sorted_df_genes_DE) < GeneTELabelNum) {
    print("GeneTELabelNum exceeds number of DE genes. Using max number possible instead.")
    top_DE_genes <- sorted_df_genes_DE$gene_ENSEMBL_join[1:nrow(sorted_df_genes_DE)]
    } else {
      top_DE_genes <- sorted_df_genes_DE$gene_ENSEMBL_join[1:GeneTELabelNum]
    }
    
    label_genes <- c(top_DE_genes)

    # Generate Plots

    # Parse the cntTableFile for relevant labels
    # -- Treatment group description
    treatmentDesc <- sub(
      "TEtranscripts_GRCm38_(.*)_vs_.*",
      "\\1",
      cntTableFile
    )
    # -- Control group description
    controlDesc <- sub(
      ".*_vs_(.*)_non_stranded\\.cntTable",
      "\\1",
      cntTableFile
    )
    
    # # -- Gene Plot --
    # p <- ggplot(df_genes, aes(x = log10_C, y = log10_T)) +
    #   geom_point(data = subset(df_genes, category == "not_DE"),
    #              alpha = 0.25, size = 0.8, color = "grey70") +
    #   geom_point(data = subset(df_genes, category == "DE_no_peak"),
    #              alpha = 0.8, size = 1.2, color = "black") +
    #   geom_point(data = subset(df_genes, category == "DE_with_peak"),
    #              alpha = 0.9, size = 1.3, color = "orange3", shape = 15) +
    #   # geom_text(df_DE_with_peak$id) + 
    #         geom_text_repel(
    #           data = subset(df_genes, gene_ENSEMBL_join %in% label_genes),
    #           aes(
    #               label = gene_ENSEMBL_join, 
    #               color = category
    #           ),
    #           size = 2.5,
    #           max.overlaps = Inf
    #         ) +
    #       scale_color_manual(
    #         values = c(
    #           not_DE = "grey70",
    #           DE_no_peak = "black",
    #           DE_with_peak = "orange3"
    #         )
    #       ) +
    #   geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    #   coord_equal() +
    #   labs(
    #     x = paste0(controlDesc, " log10(normalizedCount + 1)"),
    #     y = paste0(treatmentDesc, " log10(normalizedCount + 1)"),
    #     title = ""
    #   ) +
    #   theme_classic()

    # p <- p +
    #   labs(
    #     title = str_wrap(
    #       paste0(
    #         treatmentDesc, " vs ", controlDesc,
    #         " normalized gene differential expression top ",length(label_genes)," DE"
    #       ),
    #       width = 40
    #     )
    #   )

    # --- your existing scatter (unchanged) ---
    p <- ggplot(df_genes, aes(x = log10_C, y = log10_T)) +
      geom_point(data = subset(df_genes, category == "not_DE"),
                 alpha = 0.25, size = 0.8, color = "grey70") +
      geom_point(data = subset(df_genes, category == "DE_no_peak"),
                 alpha = 0.8, size = 1.2, color = "black") +
      geom_point(data = subset(df_genes, category == "DE_with_peak"),
                 alpha = 0.9, size = 1.3, color = "orange3", shape = 15) +
      geom_text_repel(
        data = subset(df_genes, gene_ENSEMBL_join %in% label_genes),
        aes(label = gene_ENSEMBL_join, color = category),
        size = 2.5, max.overlaps = Inf
      ) +
      scale_color_manual(values = c(not_DE = "grey70",
                                    DE_no_peak = "black",
                                    DE_with_peak = "orange3")) +
      geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
      coord_equal() +
      labs(
        x = paste0(controlDesc, " log10(normalizedCount + 1)"),
        y = paste0(treatmentDesc, " log10(normalizedCount + 1)"),
        title = str_wrap(
          paste0(treatmentDesc, " vs ", controlDesc,
                 " normalized gene differential expression top ",
                 length(label_genes), " DE labeled"),
          width = 40
        )
      ) +
      theme_classic()

    x_left <- min(df_genes$log10_C, na.rm = TRUE)
    y_top  <- max(df_genes$log10_T, na.rm = TRUE)

    # Get percentage of DEs that have peak
    numDEsWithPeak = nrow(df_genes_DE[df_genes_DE$category == "DE_with_peak", ])
    percentDEsWithPeak = numDEsWithPeak / nrow(df_genes_DE)

    # Get number of upregulated DEs
    nPositiveDEs = nrow(df_genes_DE[df_genes_DE$log2FC > 0, ])
    nNegativeDEs = nrow(df_genes_DE[df_genes_DE$log2FC < 0, ])

    # Get number of downregulated DEs
    
    p <- p +
      annotate("text", x = x_left + 2.0, y = y_top,
               label = paste0(sprintf("%.0f", percentDEsWithPeak*100),"%"),
               color = "orange3",
               hjust = 0, vjust = 1, size = 3) +
      annotate("text", x = x_left + 4.3, y = y_top,
               label = paste0("n = ", as.character(nPositiveDEs)),
               hjust = 0, vjust = 1, size = 3) +
      annotate("text", x = x_left + 4.8, y = y_top - 2.4,
               label = paste0("n = ", as.character(nNegativeDEs)),
               hjust = 0, vjust = 1, size = 3)

    
    # --- create a summary table for the pie ---
    df_counts <- df_genes_DE %>%
      count(category) %>%
      # ensure categories appear in same order/colors as scatter
      mutate(category = factor(category, levels = c("DE_no_peak", "DE_with_peak")),
             pct = n / sum(n))
    
    # --- small pie chart (no legend, compact) ---
    pie_colors <- c(DE_no_peak = "black", DE_with_peak = "orange3")
    
    p_pie <- ggplot(df_counts, aes(x = 1, y = n, fill = category)) +
      geom_col(width = 1, color = NA) +
      coord_polar(theta = "y") +
      scale_fill_manual(values = pie_colors) +
      theme_void() +
      theme(legend.position = "none") +
      # small annotation with counts (optional)
      geom_text(aes(label = n),
                position = position_stack(vjust = 0.5),
                size = 3,
                color = "white"
               )
    
    # --- combine with cowplot: place pie in top-left ---
    # x and y are [0,1] canvas coordinates; y=1 is top.
    combined <- ggdraw() +
      draw_plot(p, 0, 0, 1, 1) +                          # main scatter covers whole canvas
      draw_plot(p_pie, x = 0.12, y = 0.62, width = 0.23, height = 0.23) # tweak these
    
    # show it
    print(combined)

    ggsave(
        paste0(treatmentDesc, "_vs_", controlDesc, "_normalized_gene_differential_expression_top_",length(label_genes),"_DE.png"), 
        combined, 
        width = plotWidth, 
        height = plotHeight, 
        dpi = plotDpi
    )

    # # Print gene plot
    # print(p)    

    # ---------------------------------------------------------------------------------------------------------
    # --- TEs -------------------------------------------------------------------------------------------------
    # ---------------------------------------------------------------------------------------------------------

    df_te = df[-grep("ENSMUS", row.names(df)),]


    df_te$te_name1 <- sub(":.*", "", df_te$id)


    # Add chip_target boolean column to df_te: TRUE if a TE is near an ESC ChIP-seq peak (via RepeatMasker),
    #     FALSE otherwise
    df_te <- df_te %>%
      mutate(chip_target = te_name1 %in% te_near_ESC_peaks_df$repeatmaskerDesc)

    # Note: use te_name1 column as TE name for labels

    # Add a DE column if significant 
    df_te$DE <-  (!is.na(df_te$padj) & (df_te$padj < 0.050000) & (abs(df_te$log2FC)> 0.000000))
    
    # Specify categories for plot colors
    df_te$category <- "not_DE"
    df_te$category[df_te$DE & !df_te$chip_target] <- "DE_no_peak"
    df_te$category[df_te$DE &  df_te$chip_target] <- "DE_with_peak"
    
    # Get top GeneLabelNum (e.g., top 20 if user inputs 20 for GeneLabelNum) gene names in the correct order
    # Subset DE TEs
    df_te_DE <- subset(
      df_te,
      category == "DE_with_peak" | category == "DE_no_peak"
    )

    sorted_df_te_DE <- df_te_DE %>%
      arrange(desc(abs(log2FC)))

    if (nrow(sorted_df_te_DE) < GeneTELabelNum) {
    print("GeneTELabelNum exceeds number of DE TEs. Using max number possible instead.")
    top_DE_te <- sorted_df_te_DE$te_name1[1:nrow(sorted_df_te_DE)]
    } else {
      top_DE_te <- sorted_df_te_DE$te_name1[1:GeneTELabelNum]
    }

    label_te <- c(top_DE_te)

    
    # q <- ggplot(df_te, aes(x = log10_C, y = log10_T)) +
    #   geom_point(data = subset(df_te, category == "not_DE"),
    #              alpha = 0.25, size = 0.8, color = "grey70") +
    #   geom_point(data = subset(df_te, category == "DE_no_peak"),
    #              alpha = 0.8, size = 1.2, color = "black") +
    #   geom_point(data = subset(df_te, category == "DE_with_peak"),
    #              alpha = 0.9, size = 1.3, color = "orange3", shape = 15) +
    #   # geom_text(df_DE_with_peak$id) + 
    #         geom_text_repel(
    #           data = subset(df_te, te_name1 %in% label_te),
    #           aes(
    #               label = te_name1,
    #               color = category
    #           ),
    #           size = 2.5,
    #           max.overlaps = Inf
    #         ) +
    #       scale_color_manual(
    #         values = c(
    #           not_DE = "grey70",
    #           DE_no_peak = "black",
    #           DE_with_peak = "orange3"
    #         )
    #       ) +
    #   geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    #   coord_equal() +
    #   labs(
    #     x = paste0(controlDesc, " log10(normalizedCount + 1)"),
    #     y = paste0(treatmentDesc, " log10(normalizedCount + 1)"),
    #     title = ""
    #   ) +
    #   theme_classic()
    
    # q <- q +
    #   labs(
    #     title = str_wrap(
    #       paste0(
    #         treatmentDesc, " vs ", controlDesc,
    #         " normalized transposable element differential expression top ",length(label_te)," DE"
    #       ),
    #       width = 40
    #     )
    #   )
    
    # ggsave(
    #     paste0(treatmentDesc, "_vs_", controlDesc, "_normalized_transposable_element_differential_expression_top_",length(label_te),"_DE.png"), 
    #     q, 
    #     width = plotWidth, 
    #     height = plotHeight, 
    #     dpi = plotDpi
    # )    
    
    # # Print TE plot
    # print(q)

    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------------------------------------

    numTEtoDisplay <- ""

    if (nrow(df_te_DE) == 0) {
      numTEtoDisplay <- "0"
    } else {
      numTEtoDisplay <- as.character(length(label_te))
    }

    
    # --- your existing scatter (unchanged) ---
    q <- ggplot(df_te, aes(x = log10_C, y = log10_T)) +
      geom_point(data = subset(df_te, category == "not_DE"),
                 alpha = 0.25, size = 0.8, color = "grey70") +
      geom_point(data = subset(df_te, category == "DE_no_peak"),
                 alpha = 0.8, size = 1.2, color = "black") +
      geom_point(data = subset(df_te, category == "DE_with_peak"),
                 alpha = 0.9, size = 1.3, color = "orange3", shape = 15) +
          geom_text_repel(
              data = subset(df_te, te_name1 %in% label_te),
              aes(
                  label = te_name1,
                  color = category
              ),
              size = 2.5,
              max.overlaps = Inf
            ) +
      scale_color_manual(values = c(not_DE = "grey70",
                                    DE_no_peak = "black",
                                    DE_with_peak = "orange3")) +
      geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
      coord_equal() +
      labs(
        x = paste0(controlDesc, " log10(normalizedCount + 1)"),
        y = paste0(treatmentDesc, " log10(normalizedCount + 1)"),
        title = str_wrap(
          paste0(treatmentDesc, " vs ", controlDesc,
                 " normalized transposable element differential expression top ",
                 numTEtoDisplay, " DE labeled"),
          width = 40
        )
      ) +
      theme_classic()

    x_left <- min(df_te$log10_C, na.rm = TRUE)
    y_top  <- max(df_te$log10_T, na.rm = TRUE)

    # Get percentage of DEs that have peak
    numTeDEsWithPeak = nrow(df_te_DE[df_te_DE$category == "DE_with_peak", ])
    percentTeDEsWithPeak = numTeDEsWithPeak / nrow(df_te_DE)

    # Get number of upregulated DEs
    nPositiveTeDEs = nrow(df_te_DE[df_te_DE$log2FC > 0, ])
    nNegativeTeDEs = nrow(df_te_DE[df_te_DE$log2FC < 0, ])

    # Get number of downregulated DEs

    if (numTEtoDisplay != "0") {
        q <- q +
          annotate("text", x = x_left + 2.0, y = y_top,
                   label = paste0(sprintf("%.0f", percentTeDEsWithPeak*100),"%"),
                   color = "orange3",
                   hjust = 0, vjust = 1, size = 3)
        }

    
    q <- q +
      annotate("text", x = x_left + 3.5, y = y_top,
               label = paste0("n = ", as.character(nPositiveTeDEs)),
               hjust = 0, vjust = 1, size = 3) +
      annotate("text", x = x_left + 4.2, y = y_top - 3.0,
               label = paste0("n = ", as.character(nNegativeTeDEs)),
               hjust = 0, vjust = 1, size = 3)

    
    # --- create a summary table for the pie ---
    df_TE_counts <- df_te_DE %>%
      count(category) %>%
      # ensure categories appear in same order/colors as scatter
      mutate(category = factor(category, levels = c("DE_no_peak", "DE_with_peak")),
             pct = n / sum(n))
    
    # --- small pie chart (no legend, compact) ---
    pie_TE_colors <- c(DE_no_peak = "black", DE_with_peak = "orange3")
    
    q_pie <- ggplot(df_TE_counts, aes(x = 1, y = n, fill = category)) +
      geom_col(width = 1, color = NA) +
      coord_polar(theta = "y") +
      scale_fill_manual(values = pie_TE_colors) +
      theme_void() +
      theme(legend.position = "none") +
      # small annotation with counts (optional)
      geom_text(aes(label = n),
                position = position_stack(vjust = 0.5),
                size = 3,
                color = "white"
               )
    
    # --- combine with cowplot: place pie in top-left ---
    # x and y are [0,1] canvas coordinates; y=1 is top.
    combined_q <- ggdraw() +
      draw_plot(q, 0, 0, 1, 1) +                          # main scatter covers whole canvas
      draw_plot(q_pie, x = 0.12, y = 0.62, width = 0.23, height = 0.23) # tweak these
    
    # show it
    print(combined_q)



    
    # print(nrow(df_te_DE))
    # print(length(label_te))

    ggsave(
        paste0(treatmentDesc, "_vs_", controlDesc, "_normalized_transposable_element_differential_expression_top_",numTEtoDisplay,"_DE.png"), 
        combined_q, 
        width = plotWidth, 
        height = plotHeight, 
        dpi = plotDpi
    )
    
}
In [16]:
topDEnum <- 20
In [17]:
createScatterPlots("TEtranscripts_GRCm38_E10_777tm1d_2KO_males_vs_1WT_female_1WT_male_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [18]:
createScatterPlots("TEtranscripts_GRCm38_E8_777tm1a_KO_vs_WT_females_non_stranded.cntTable",10,5,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

-- replacing outliers and refitting for 105 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

estimating dispersions

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's colour values.”
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's fill values.”
No description has been provided for this image
No description has been provided for this image
In [19]:
createScatterPlots("TEtranscripts_GRCm38_E8_777tm1a_KO_vs_WT_males_non_stranded.cntTable",3,5,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's colour values.”
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's fill values.”
No description has been provided for this image
No description has been provided for this image
In [20]:
createScatterPlots("TEtranscripts_GRCm38_mESC_R1_777KO_EGFP_excised_vs_WT_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [21]:
createScatterPlots("TEtranscripts_GRCm38_mF9_OE_pCMV6_777-HA_vs_pSB_mock_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [22]:
createScatterPlots("TEtranscripts_GRCm38_mF9_OE_pCMV6_777-R297W-HA_vs_pSB_mock_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [28]:
# Not running mMEF E13 at this time due to 
createScatterPlots("TEtranscripts_GRCm38_mMEF_E13_DUFKZFP_cluster_KO_vs_WT_males_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's colour values.”
Warning message:
“No shared levels found between `names(values)` of the manual scale and the
data's fill values.”
No description has been provided for this image
No description has been provided for this image
In [24]:
createScatterPlots("TEtranscripts_GRCm38_mMEF_E15_777-R297W-KI_vs_E15_WT_males_non_stranded.cntTable",3,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

No description has been provided for this image
No description has been provided for this image
In [25]:
createScatterPlots("TEtranscripts_GRCm38_mMEF_E15_777-R297W-KI_vs_E16_WT_females_non_stranded.cntTable",3,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

No description has been provided for this image
No description has been provided for this image
In [26]:
createScatterPlots("TEtranscripts_GRCm38_mMEF_E15_777KO_tm1a_vs_WT_females_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [27]:
createScatterPlots("TEtranscripts_GRCm38_mMEF_E15_777KO_tm1a_vs_WT_males_non_stranded.cntTable",2,2,topDEnum)
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

[1] "GeneTELabelNum exceeds number of DE TEs. Using max number possible instead."
No description has been provided for this image
No description has been provided for this image
In [ ]: